安装并且加载必要的包

需要自行下载安装一些必要的R包,包括我们的测试数据集来源的R包scRNAseq ,以及4个单细胞转录组数据处理包!

因为大量学员在中国大陆,所以需要特别的R包安装方法,就是切换镜像后再下载R包。参考:http://www.bio-info-trainee.com/3727.html

options()$repos  ## 查看使用install.packages安装时的默认镜像
##     CRAN 
## "@CRAN@"
options()$BioC_mirror ##查看使用bioconductor的默认镜像
## NULL
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像
options()$repos 
##                                         CRAN 
## "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
options()$BioC_mirror
## [1] "https://mirrors.ustc.edu.cn/bioc/"
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager") 
if(!require('Seurat')){
  BiocManager::install('Seurat',ask = F,update = F)
}
if(!require('scater')){
  BiocManager::install(c( 'scater'),ask = F,update = F)
}
if(!require('monocle')){
  BiocManager::install(c( 'monocle'),ask = F,update = F)
}
if(!require('scRNAseq')){
  BiocManager::install(c( 'scRNAseq'),ask = F,update = F)
}
if(!require('SC3')){
  BiocManager::install(c( 'SC3'),ask = F,update = F)
}
if(!require('M3Drop')){
  BiocManager::install(c( 'M3Drop'),ask = F,update = F)
}
if(!require('ggpubr')){
  BiocManager::install(c( 'ggpubr'),ask = F,update = F)
}

安装成功后就可以加载R包:

rm(list = ls())  
options(warn=-1)  
suppressMessages(library(scater))
suppressMessages(library(Seurat))
suppressMessages(library(monocle))
suppressMessages(library(scRNAseq)) 
suppressMessages(library(SC3)) 
suppressMessages(library(M3Drop)) 

简单了解数据集

这个包内置的是 Pollen et al. 2014 数据集,人类单细胞细胞,分成4类,分别是 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞,理解这些需要一定的生物学背景知识,如果不感兴趣,可以略过。

这个数据集很出名,截止2019年1月已经有近400的引用了,后面的人开发R包算法都会在其上面做测试,本例子只使用了数据集的4种细胞类型而已,因为 scRNAseq 这个R包就提供了这些,完整的数据是 23730 features, 301 samples, 地址为https://hemberg-lab.github.io/scRNA.seq.datasets/human/tissues/ , 这个网站非常值得推荐,简直是一个宝藏。

这里面的表达矩阵是由 RSEM (Li and Dewey 2011) 软件根据 hg38 RefSeq transcriptome 得到的,总是130个文库,每个细胞测了两次,测序深度不一样(非常的不一样)

library(scRNAseq)
## ----- Load Example Data -----
data(fluidigm) 
ct <- floor(assays(fluidigm)$rsem_counts)
ct[1:4,1:4] 
##          SRR1275356 SRR1274090 SRR1275251 SRR1275287
## A1BG              0          0          0          0
## A1BG-AS1          0          0          0          0
## A1CF              0          0          0          0
## A2M               0          0          0         33
sample_ann <- as.data.frame(colData(fluidigm))
DT::datatable(sample_ann,
              
            rownames= FALSE,extensions = c('Scroller'), 
            options = list(  
  pageLength = 10, 
  lengthMenu = list(c(10, 50, 100,-1), c('10', '50','100', 'All')),
  columnDefs = list(list(className = 'dt-center', targets = 0 :8)),
  scrollX = TRUE,
  fixedHeader = TRUE,
  fixedColumns = TRUE ,
  deferRender = TRUE 
),
filter = 'top',
escape = FALSE
)

先探索表型信息

前面说到,这个数据集是130个文库,每个细胞测了两次,测序深度不一样,这130个细胞,分成4类,分别是: pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这种孕期细胞。

批量,粗略的看一看各个细胞的一些统计学指标的分布情况

library(ggpubr)
box <- lapply(colnames(sample_ann[,1:19]),function(i) {
    dat <-  sample_ann[,i,drop=F] 
    dat$sample=rownames(dat)
    dat$group='all cells'
    ## 画boxplot 
   p <- ggboxplot(dat, x = "group", y = i, 
                add = "jitter" )
 p
})
plot_grid(plotlist=box, ncol=5 )

# ggsave(file="stat_all_cells.pdf")

很明显,他们可以有根据来进行分组,这里不再演示。 不过通常的文章并不会考虑如此多的细节,这里重点是批量,代码技巧非常值得你们学校。

因为进行了简单探索,对表型数据就有了把握,接下来可以进行一定程度的过滤,因为细节太多,这里重点是批量,代码技巧非常值得你们学校。

pa <- colnames(sample_ann[,c(1:9,11:16,18,19)])
tf <- lapply(pa,function(i) {
 # i=pa[1]
  dat <-  sample_ann[,i]  
  dat <- abs(log10(dat))
  fivenum(dat)
  (up <- mean(dat)+2*sd(dat))
  (down <- mean(dat)- 2*sd(dat) ) 
  valid <- ifelse(dat > down & dat < up, 1,0 ) 
})

tf <- do.call(cbind,tf)
choosed_cells <- apply(tf,1,function(x) all(x==1))
table(sample_ann$Biological_Condition)
## 
##   GW16   GW21 GW21+3    NPC 
##     52     16     32     30
sample_ann=sample_ann[choosed_cells,]
table(sample_ann$Biological_Condition)
## 
##   GW16   GW21 GW21+3    NPC 
##     36     11     23     29
ct <- ct[,choosed_cells]

再探索基因表达情况

ct[1:4,1:4] 
##          SRR1274090 SRR1275287 SRR1275364 SRR1275269
## A1BG              0          0          0          0
## A1BG-AS1          0          0          0          0
## A1CF              0          0          0          0
## A2M               0         33          0         51
counts <- ct
fivenum(apply(counts,1,function(x) sum(x>0) ))
##      A1CF     OR8G1 LINC01003    MRPS36     YWHAZ 
##         0         0         4        26        99
boxplot(apply(counts,1,function(x) sum(x>0) ))

fivenum(apply(counts,2,function(x) sum(x>0) ))
## SRR1275365 SRR1275345 SRR1275248 SRR1275273 SRR1274125 
##     1566.0     3043.0     4002.0     5706.5     8024.0
hist(apply(counts,2,function(x) sum(x>0) ))

choosed_genes=apply(counts,1,function(x) sum(x>0) )>0
table(choosed_genes)
## choosed_genes
## FALSE  TRUE 
##  9496 16759
counts <- counts[choosed_genes,]

单细胞转录组分析流程介绍

注意,这里的 seurat是2.x版本,同理,monocle也是2版本。

用法 Seurat scater monocle
创建R包要求的对象 CreateSeuratObject SingleCellExperiment newCellDataSet
QC and selecting cell 创建矩阵的同时,可以选择过滤参数min.cell,min.gene等。还有FilterCells函数可以去除不合格的细胞。 calculateQCMetrics()函数,其中的feature_controls参数可以指定过滤指标,然后有一系列的可视化函数 用基础R函数进行初步过滤,还可以用detectGenes()函数加上subset()过滤
表达量的标准化或者归一化 NormalizeData calculateCPM()等系列函数 estimateSizeFactors()还有estimateDispersions
寻找重要的基因 FindVariableGenes 没有看到专门的函数,可以借用R基础函数 differentialGeneTest()函数
去除干扰因素 ScaleData 借用limma包的 removeBatchEffect 函数 去除干扰因素的功能被包装在降维函数中
降维 RunPCA或者RunTSNE runPCA或者runTSNE,runDiffusionMap reduceDimension函数,可以选择多种参数
降维后可视化 VizPCA和PCElbowPlot;PCAPlot或者TSNEPlot plotPCA和plotTSNE等等 plot_cell_trajectory()或plot_genes_in_pseudotime
分类群cluster FindClusters 并没有包装聚类函数,而且辅助其它R包,或者R基础函数 clusterCells
聚类后找每个细胞亚群的标志基因 FindMarkers和FindAllMarkers函数 借助SC3包 newCellTypeHierarchy classifyCells
calculateMarkerSpecificity

step1: 创建对象

首先为 scater 包创建对象

pheno_data <- as.data.frame(colData(fluidigm))
ct <- floor(assays(fluidigm)$rsem_counts)
## 这里需要把Pollen的表达矩阵做成我们的 scater 要求的对象
# data("sc_example_counts")
# data("sc_example_cell_info") 
# 你也可以尝试该R包自带的数据集。
# 参考 https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-intro.R
sce_scater <- SingleCellExperiment(
    assays = list(counts = ct), 
    colData = pheno_data
    )
sce_scater
## class: SingleCellExperiment 
## dim: 26255 130 
## metadata(0):
## assays(1): counts
## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
## reducedDimNames(0):
## spikeNames(0):
# 后面所有的分析都是基于 sce_scater 这个变量
# 是一个 SingleCellExperiment 对象,被很多单细胞R包采用。

然后为 seurat 包创建对象

meta <- as.data.frame(colData(fluidigm))
ct <- floor(assays(fluidigm)$rsem_counts)
counts <- ct
sce_seurat <- CreateSeuratObject(raw.data = counts, 
                             meta.data =meta,
                             min.cells = 3, 
                             min.genes = 200, 
                             project = "Pollen")
sce_seurat
## An object of class seurat in project Pollen 
##  14656 genes across 130 samples.
## 后续所有的分析都基于这个 sce_seurat 变量,是一个对象 

最后为 monocle 包创建对象

ct <- floor(assays(fluidigm)$rsem_counts)
gene_ann <- data.frame(
  gene_short_name = row.names(ct), 
  row.names = row.names(ct)
)
sample_ann=as.data.frame(colData(fluidigm))
pd <- new("AnnotatedDataFrame",
          data=sample_ann)
fd <- new("AnnotatedDataFrame",
          data=gene_ann)
sce_monocle <- newCellDataSet(
  ct, 
  phenoData = pd,
  featureData =fd,
  expressionFamily = negbinomial.size(),
  lowerDetectionLimit=1)
sce_monocle
## CellDataSet (storageMode: environment)
## assayData: 26255 features, 130 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
##   fvarLabels: gene_short_name
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

再次回顾一下这3个对象。

sce_seurat
## An object of class seurat in project Pollen 
##  14656 genes across 130 samples.
sce_scater
## class: SingleCellExperiment 
## dim: 26255 130 
## metadata(0):
## assays(1): counts
## rownames(26255): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
## reducedDimNames(0):
## spikeNames(0):
sce_monocle
## CellDataSet (storageMode: environment)
## assayData: 26255 features, 130 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
##   fvarLabels: gene_short_name
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

step2: 质量控制

seurat 包,

mito.genes <- grep(pattern = "^MT-", 
                   x = rownames(sce_seurat@data), 
                   value = TRUE)
# 恰好这个例子的表达矩阵里面没有线粒体基因
percent.mito <- Matrix::colSums(sce_seurat@raw.data[mito.genes, ]) / Matrix::colSums(sce_seurat@raw.data)
## 也可以加入很多其它属性,比如 ERCC 等。

# AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
sce_seurat <- AddMetaData(object = sce_seurat, 
                          metadata = percent.mito,
                         col.name = "percent.mito")
VlnPlot(object = sce_seurat, 
        features.plot = c("nGene", "nUMI", "percent.mito"), 
        group.by = 'Biological_Condition', nCol = 3)

GenePlot(object = sce_seurat, gene1 = "nUMI", gene2 = "nGene")

CellPlot(sce_seurat,
         sce_seurat@cell.names[3],
         sce_seurat@cell.names[4],
         do.ident = FALSE)

# FilterCells函数
# sce_seurat
# sce_seurat <- FilterCells(object = sce_seurat, 
#                     subset.names = c("nGene", "percent.mito"), 
#                     low.thresholds = c(200, -Inf), 
#                     high.thresholds = c(2500, 0.05))
# sce_seurat

在scater包,

这里仅仅是演示 scater 包最简单的质量控制代码,详细代码见:https://bioconductor.org/packages/release/bioc/vignettes/scater/inst/doc/vignette-qc.html

genes <- rownames(rowData(sce_scater))
genes[grepl('^MT-',genes)]
## character(0)
genes[grepl('^ERCC-',genes)] 
## character(0)
exprs(sce_scater) <- log2(
    calculateCPM(sce_scater ) + 1)
keep_feature <- rowSums(exprs(sce_scater) > 0) > 0
table(keep_feature)
## keep_feature
## FALSE  TRUE 
##  9159 17096
sce_scater <- sce_scater[keep_feature,]
sce_scater 
## class: SingleCellExperiment 
## dim: 17096 130 
## metadata(0):
## assays(2): counts logcounts
## rownames(17096): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(130): SRR1275356 SRR1274090 ... SRR1275366 SRR1275261
## colData names(28): NREADS NALIGNED ... Cluster1 Cluster2
## reducedDimNames(0):
## spikeNames(0):
sce_scater <- calculateQCMetrics(sce_scater)
plotHighestExprs(sce_scater, exprs_values = "counts")

plotExprsFreqVsMean(sce_scater)

monocle包,

sce_monocle
## CellDataSet (storageMode: environment)
## assayData: 26255 features, 130 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
##   varLabels: NREADS NALIGNED ... Size_Factor (29 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: A1BG A1BG-AS1 ... ZZZ3 (26255 total)
##   fvarLabels: gene_short_name
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
## 起初是 
sce_monocle <- detectGenes(sce_monocle, min_expr = 0.1)
print(head(fData(sce_monocle)))
##          gene_short_name num_cells_expressed
## A1BG                A1BG                  10
## A1BG-AS1        A1BG-AS1                   2
## A1CF                A1CF                   1
## A2M                  A2M                  21
## A2M-AS1          A2M-AS1                   3
## A2ML1              A2ML1                   9
expressed_genes <- row.names(subset(fData(sce_monocle),
                                    num_cells_expressed >= 5))
length(expressed_genes)
## [1] 13385
sce_monocle <- sce_monocle[expressed_genes,]
sce_monocle
## CellDataSet (storageMode: environment)
## assayData: 13385 features, 130 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
##   varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: A1BG A2M ... ZZZ3 (13385 total)
##   fvarLabels: gene_short_name num_cells_expressed
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
# 过滤基因后是  
tmp=pData(sce_monocle)
fivenum(tmp[,1])
## [1]    91616   232899   892209  8130850 14477100
fivenum(tmp[,30])
## [1] 1418.0 2961.0 3841.5 5381.0 8221.0
# 这里并不需要过滤细胞,如果想过滤,就自己摸索阈值,然后挑选细胞即可。
# 这里留下来了所有的细胞。
valid_cells <- row.names(pData(sce_monocle) )
sce_monocle <- sce_monocle[,valid_cells]
sce_monocle 
## CellDataSet (storageMode: environment)
## assayData: 13385 features, 130 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRR1275356 SRR1274090 ... SRR1275261 (130 total)
##   varLabels: NREADS NALIGNED ... num_genes_expressed (30 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: A1BG A2M ... ZZZ3 (13385 total)
##   fvarLabels: gene_short_name num_cells_expressed
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

step3: 表达量的标准化和归一化

在 seurat包需要先假定平均细胞测序文库大小,这里是 10000

sce_seurat <- NormalizeData(object = sce_seurat, 
                     normalization.method = "LogNormalize", 
                     scale.factor = 10000,
                     display.progress = F)

在scater包:

assays(sce_scater)
## List of length 2
## names(2): counts logcounts
counts(sce_scater) <- assays(sce_scater)$counts
norm_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)

stand_exprs(sce_scater) <- log2(calculateCPM(sce_scater, use_size_factors = FALSE) + 1)

tpm(sce_scater) <- calculateTPM(sce_scater, effective_length = 5e4)

cpm(sce_scater) <- calculateCPM(sce_scater, use_size_factors = FALSE)

assays(sce_scater)
## List of length 6
## names(6): counts logcounts norm_exprs stand_exprs tpm cpm

详细理论见:https://hemberg-lab.github.io/scRNA.seq.course/cleaning-the-expression-matrix.html#normalization-theory

  • 7.8.1 Raw
  • 7.8.2 CPM
  • 7.8.3 Size-factor (RLE)
  • 7.8.4 Upperquantile
  • 7.8.5 TMM
  • 7.8.6 scran
  • 7.8.7 Downsampling

monocle包,

colnames(phenoData(sce_monocle)@data)
##  [1] "NREADS"                       "NALIGNED"                    
##  [3] "RALIGN"                       "TOTAL_DUP"                   
##  [5] "PRIMER"                       "INSERT_SZ"                   
##  [7] "INSERT_SZ_STD"                "COMPLEXITY"                  
##  [9] "NDUPR"                        "PCT_RIBOSOMAL_BASES"         
## [11] "PCT_CODING_BASES"             "PCT_UTR_BASES"               
## [13] "PCT_INTRONIC_BASES"           "PCT_INTERGENIC_BASES"        
## [15] "PCT_MRNA_BASES"               "MEDIAN_CV_COVERAGE"          
## [17] "MEDIAN_5PRIME_BIAS"           "MEDIAN_3PRIME_BIAS"          
## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"                 
## [21] "Lane_ID"                      "LibraryName"                 
## [23] "avgLength"                    "spots"                       
## [25] "Biological_Condition"         "Coverage_Type"               
## [27] "Cluster1"                     "Cluster2"                    
## [29] "Size_Factor"                  "num_genes_expressed"
sce_monocle <- estimateSizeFactors(sce_monocle)
sce_monocle <- estimateDispersions(sce_monocle)
colnames(phenoData(sce_monocle)@data)
##  [1] "NREADS"                       "NALIGNED"                    
##  [3] "RALIGN"                       "TOTAL_DUP"                   
##  [5] "PRIMER"                       "INSERT_SZ"                   
##  [7] "INSERT_SZ_STD"                "COMPLEXITY"                  
##  [9] "NDUPR"                        "PCT_RIBOSOMAL_BASES"         
## [11] "PCT_CODING_BASES"             "PCT_UTR_BASES"               
## [13] "PCT_INTRONIC_BASES"           "PCT_INTERGENIC_BASES"        
## [15] "PCT_MRNA_BASES"               "MEDIAN_CV_COVERAGE"          
## [17] "MEDIAN_5PRIME_BIAS"           "MEDIAN_3PRIME_BIAS"          
## [19] "MEDIAN_5PRIME_TO_3PRIME_BIAS" "sample_id.x"                 
## [21] "Lane_ID"                      "LibraryName"                 
## [23] "avgLength"                    "spots"                       
## [25] "Biological_Condition"         "Coverage_Type"               
## [27] "Cluster1"                     "Cluster2"                    
## [29] "Size_Factor"                  "num_genes_expressed"

step4: 去除干扰因素

在 seurat 包,去除一些文库大小,线粒体基因含量,ERCC含量等因素的功能被包装在 ScaleData 函数里面,前提是需要被去除的因素提供 AddMetaData 函数添加到了对象。

sce_seurat <- ScaleData(object = sce_seurat, 
                 vars.to.regress = c("nUMI"),
                 display.progress = F)
## 所有放在 vars.to.regress 参数的变量均可以被去除

在scater包, 主要是可视化那些干扰因素:

sce_scater <- runPCA(sce_scater)
# colnames(colData(sce_scater))
plotPCA(
    sce_scater, 
    colour_by = "Biological_Condition",
    size_by = "NALIGNED"
)

# 还有 plotQC 函数。

如果需要去除干扰因素,可以借用limma包的 removeBatchEffect 函数

library(limma)
batch <- rep(1:2, each=20)
corrected <- removeBatchEffect(logcounts(example_sce), block=batch)
assay(example_sce, "corrected_logcounts") <- corrected

在monocle包,去除干扰因素的功能被包装在降维函数中,示例如下:

# 放在 residualModelFormulaStr 里面的是需要去除的
cds <- reduceDimension(cds, max_components = 2, num_dim = 2,
                        reduction_method = 'tSNE',
                        residualModelFormulaStr = "~Biological_Condition + num_genes_expressed",
                        verbose = T)
cds <- clusterCells(cds, num_clusters = 2)
plot_cell_clusters(cds, 1, 2, color = "Biological_Condition")

## 上面去除了Biological_Condition,所以接下来聚类它们就被打散了。

step5: 判断重要的基因

寻找波动比较明显的基因,后续用这些基因而非全部基因进行分析,主要为了降低计算量。

seurat 包,必须要先 normalization ,然后才能进行FindVariableGenes 计算,代码如下:

sce_seurat <- FindVariableGenes(object = sce_seurat, 
                                mean.function = ExpMean,
                                dispersion.function = LogVMR, 
                         x.low.cutoff = 0.0125, 
                         x.high.cutoff = 3, 
                         y.cutoff = 0.5)

# 通过调整参数可以得到不同数量的 var.genes
length(sce_seurat@var.genes)
## [1] 4944

scater包, 没有看到专门的函数,可以借用R基础函数。

monocle 包中,同样也不是所有的基因都有作用,所以先进行挑选,合适的基因才在后续分析中用来降维聚类。

disp_table <- dispersionTable(sce_monocle)
# 也可以先挑选差异基因
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
dim(unsup_clustering_genes)
## [1] 12911     4
sce_monocle <- setOrderingFilter(sce_monocle,
                                 unsup_clustering_genes$gene_id)
plot_ordering_genes(sce_monocle) 

plot_pc_variance_explained(sce_monocle, return_all = F) 

# norm_method='log'

后面做降维的时候的 num_dim 参数选择基于上面的PCA图

step6: 多种降维算法

seurat 包, 降维之前必须要先 Run ScaleData() , 每个降维算法都被单独包装为函数了。

sce_seurat <- RunPCA(object = sce_seurat, 
              pc.genes = sce_seurat@var.genes, 
              do.print = TRUE, 
              pcs.print = 1:5, 
              genes.print = 5)
## [1] "PC1"
## [1] "MLLT11"    "KIDINS220" "SLA"       "CALM1"     "FAM110B"  
## [1] ""
## [1] "HMGB2"  "LIX1"   "LDHA"   "TRIM59" "ENO1"  
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "CENPF"    "SHISA2"   "CDK1"     "HIST1H4C" "BARD1"   
## [1] ""
## [1] "ECSCR"  "MYCT1"  "ELK3"   "IMPG2"  "ADGRF5"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "ADGRV1"  "GPX3"    "FAM107A" "HOPX"    "PTN"    
## [1] ""
## [1] "BCAT1"  "EFNA5"  "DLK1"   "TUBA1C" "CXADR" 
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "S100A6"   "LOX"      "ADAMTS16" "CA2"      "DKK3"    
## [1] ""
## [1] "FAM50A" "DDX54"  "ARFIP2" "IPP"    "PXMP2" 
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "ASB16-AS1"  "CENPL"      "ATP1A1-AS1" "CHD9"       "KLHL31"    
## [1] ""
## [1] "CD44"  "CA2"   "SYNJ2" "LRP10" "TTC38"
## [1] ""
## [1] ""
sce_seurat@dr
## $pca
## A dimensional reduction object with key PC 
##  Number of dimensions: 20 
##  Projected dimensional reduction calculated: FALSE 
##  Jackstraw run: FALSE
tmp <- sce_seurat@dr$pca@gene.loadings
sce_seurat <- RunTSNE(object = sce_seurat, 
               dims.use = 1:10, 
               do.fast = TRUE, 
               perplexity=10)
sce_seurat@dr 
## $pca
## A dimensional reduction object with key PC 
##  Number of dimensions: 20 
##  Projected dimensional reduction calculated: FALSE 
##  Jackstraw run: FALSE 
## 
## $tsne
## A dimensional reduction object with key tSNE_ 
##  Number of dimensions: 2 
##  Projected dimensional reduction calculated: FALSE 
##  Jackstraw run: FALSE

scater包,

sce <- runPCA(sce_scater)
# 这里并没有进行任何基因的挑选,就直接进行了PCA,与 seurat包不一样。
reducedDimNames(sce) 
## [1] "PCA"
sce <- runPCA(sce, ncomponents=20)
# Perplexity of 10 just chosen here arbitrarily. 
set.seed(1000)
# 这里的这个 perplexity 参数很重要
sce <- runTSNE(sce, perplexity=30)
sce <- runDiffusionMap(sce)
reducedDimNames(sce)
## [1] "PCA"          "TSNE"         "DiffusionMap"
sce_scater=sce

monocle包,降维函数就是 reduceDimension , 它包装这一个去除干扰因素的功能, 可供选择的降维算法包括: "DDRTree", "ICA", "tSNE", "SimplePPT", "L1-graph", "SGL-tree"

# 放在 residualModelFormulaStr 参数里面的是需要去除的
sce_monocle <- reduceDimension(sce_monocle, 
                               max_components = 2, num_dim = 2,
                        reduction_method = 'tSNE', 
                        verbose = T)

step7: 可视化降维结果

在 seurat 包, 两个降维算法被单独包装为两个函数,所以可视化也是两个函数。

sce_seurat@dr 
## $pca
## A dimensional reduction object with key PC 
##  Number of dimensions: 20 
##  Projected dimensional reduction calculated: FALSE 
##  Jackstraw run: FALSE 
## 
## $tsne
## A dimensional reduction object with key tSNE_ 
##  Number of dimensions: 2 
##  Projected dimensional reduction calculated: FALSE 
##  Jackstraw run: FALSE
PCAPlot(sce_seurat, dim.1 = 1, dim.2 = 2,
        group.by = 'Biological_Condition')

TSNEPlot(sce_seurat,group.by = 'Biological_Condition')

VizPCA( sce_seurat, pcs.use = 1:2)

PCElbowPlot(object = sce_seurat)

sce_seurat <- ProjectPCA(sce_seurat, do.print = FALSE)
PCHeatmap(object = sce_seurat, 
          pc.use = 1, 
          cells.use = ncol(sce_seurat@data), 
          do.balanced = TRUE, 
          label.columns = FALSE)

在scater包, 同样的是多个降维函数和多个可视化函数

plotTSNE(sce_scater,  
         colour_by = "Biological_Condition" )

plotPCA(sce_scater, 
        colour_by = "Biological_Condition" )

plotPCA(sce_scater, ncomponents = 4,  
        colour_by = "Biological_Condition" )

plotDiffusionMap(sce_scater,  
                 colour_by = "Biological_Condition" )

在monocle包,有趣的是降维后必须先分群才能进行可视化。

sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
## Distance cutoff calculated to 0.5035647
plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")

step8: 多种聚类算法

聚类后就可以根据阈值进行分群

seurat 包, 重点: 需要搞懂这里的 resolution 参数,而且降维算法可以选PCA或者ICA , 分群算法也可以选择。

sce_seurat <- FindClusters(object = sce_seurat, 
                    reduction.type = "pca", 
                    dims.use = 1:10, force.recalc = T,
                    resolution = 0.9, print.output = 0,
                    save.SNN = TRUE)
PrintFindClustersParams(sce_seurat)
## Parameters used in latest FindClusters calculation run on: 2019-03-21 17:35:14
## =============================================================================
## Resolution: 0.9
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 100             10
## -----------------------------------------------------------------------------
## Reduction used          k.param          prune.SNN
##      pca                 30                0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
table(sce_seurat@meta.data$res.0.9)
## 
##  0  1  2 
## 60 36 34

scater包, 并没有包装聚类函数,而且辅助其它R包,或者R基础函数:

  • SC3
  • pcaReduce
  • tSNE + kmeans
  • SNN-Cliq
  • SINCERA

最常用的是无缝连接 SC3 包:

library(SC3) # BiocManager::install('SC3')
sce <- sc3_estimate_k(sce_scater)
metadata(sce)$sc3$k_estimation
## [1] 24
rowData(sce)$feature_symbol <- rownames(rowData(sce))

一步运行sc3的所有分析, 相当耗费时间

这里kn表示的预估聚类数, 考虑到数据集是已知的,我们强行设置为4组, 具体数据要具体考虑。

# 耗费时间
kn <- 4 ## 这里可以选择 3:5 看多种分类结果。
sc3_cluster <- "sc3_4_clusters"
Sys.time()
## [1] "2019-03-21 17:35:14 CST"
sce <- sc3(sce, ks = kn, biology = TRUE)
Sys.time()
## [1] "2019-03-21 17:36:27 CST"

monocle包,如下:

sce_monocle <- clusterCells(sce_monocle, num_clusters = 4)
## Distance cutoff calculated to 0.5035647
plot_cell_clusters(sce_monocle)

plot_cell_clusters(sce_monocle, 1, 2, color = "Biological_Condition")

可供选择的聚类算法包括:densityPeak, louvian and DDRTree

step9: 聚类后找每个细胞亚群的标志基因

seurat包,

sce.markers <- FindAllMarkers(object = sce_seurat, only.pos = TRUE, min.pct = 0.25, 
                              thresh.use = 0.25)
# DT::datatable(sce.markers)
library(dplyr)
sce.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## # A tibble: 6 x 7
## # Groups:   cluster [3]
##      p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene     
##      <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
## 1 1.01e- 5      1.84 0.467 0.114  1.48e- 1 0       ERBB4    
## 2 7.88e- 3      1.71 0.3   0.114  1.00e+ 0 0       PDZRN3   
## 3 2.30e-22      2.43 0.806 0      3.37e-18 1       DLK1     
## 4 1.94e-18      2.47 0.917 0.202  2.84e-14 1       BCAT1    
## 5 2.91e-14      2.23 0.882 0.292  4.26e-10 2       MEF2C    
## 6 6.21e- 7      1.86 0.588 0.229  9.10e- 3 2       LINC00643
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)

可以单独可视化这些标志基因。

# setting slim.col.label to TRUE will print just the cluster IDS instead of# every cell name
DoHeatmap(object = sce_seurat, 
          genes.use = top10$gene, 
          slim.col.label = TRUE, 
          remove.key = TRUE)

scater包,可视化展示部分, kn就是聚类数,就能看到标志基因了。

热图: 比较先验分类和SC3的聚类的一致性

sc3_plot_consensus(sce, k = kn, show_pdata = c("Biological_Condition",sc3_cluster))

展示表达量信息

sc3_plot_expression(sce, k = kn, show_pdata =  c("Biological_Condition",sc3_cluster))

展示可能的标记基因

sc3_plot_markers(sce, k = kn, show_pdata =  c("Biological_Condition",sc3_cluster))

在PCA上展示SC3的聚类结果

plotPCA(sce, colour_by =  sc3_cluster )

# sc3_interactive(sce)

monocle包,应该是没有找标志基因的函数,但是有推断差异基因的函数,而且它多一个功能,就是进行发育轨迹的推断。 推断发育轨迹才是monocle的拿手好戏,也是它荣升为3大R包的核心竞争力。

第一步: 挑选合适的基因. 有多个方法,例如提供已知的基因集,这里选取统计学显著的差异基因列表

Sys.time()
## [1] "2019-03-21 17:37:06 CST"
cds=sce_monocle
diff_test_res <- differentialGeneTest(cds,
                                      fullModelFormulaStr = "~Biological_Condition")
Sys.time()
## [1] "2019-03-21 17:39:49 CST"
# 可以看到运行耗时

# Select genes that are significant at an FDR < 10%
sig_genes <- subset(diff_test_res, qval < 0.1)
head(sig_genes[,c("gene_short_name", "pval", "qval")] )
##       gene_short_name         pval         qval
## A1BG             A1BG 4.112065e-04 1.460722e-03
## A2M               A2M 4.251744e-08 4.266086e-07
## AACS             AACS 2.881832e-03 8.275761e-03
## AADAT           AADAT 1.069794e-02 2.621123e-02
## AAGAB           AAGAB 1.156771e-07 1.021331e-06
## AAMP             AAMP 7.626789e-05 3.243869e-04
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
cds <- setOrderingFilter(cds, ordering_genes)
plot_ordering_genes(cds) 

第二步: 降维。降维的目的是为了更好的展示数据。函数里提供了很多种方法, 不同方法的最后展示的图都不太一样, 其中“DDRTree”是Monocle2使用的默认方法

cds <- reduceDimension(cds, max_components = 2,
                            method = 'DDRTree')

第三步: 对细胞进行排序

cds <- orderCells(cds)

最后两个可视化函数,对结果进行可视化

plot_cell_trajectory(cds, color_by = "Biological_Condition")  

可以很明显看到细胞的发育轨迹,正好对应 pluripotent stem cells 分化而成的 neural progenitor cells (“NPC”) ,还有 “GW16” and “GW21” ,“GW21+3” 这时间进展的孕期细胞。

plot_genes_in_pseudotime可以展现marker基因

最开始挑选合适基因,除了我们演示的找统计学显著的差异表达基因这个方法外,还可以于已知的标记基因,主要是基于生物学背景知识。

如果是已知基因列表,就需要自己读取外包文件,导入R里面来分析。

step10: 继续分类

只需要挑选前面步骤分类好的细胞,去表达矩阵里面进行筛选细胞,重新走一遍上面的流程即可。

一些总结:

首先是:seurat总结

counts矩阵进来后被包装为对象,方便操作。

然后一定要经过 NormalizeDataScaleData 的操作

函数 FindVariableGenes 可以挑选适合进行下游分析的基因集。

函数 RunPCARunTSNE 进行降维

函数 FindClusters 直接就分群了,非常方便 函数 FindAllMarkers 可以对分群后各个亚群找标志基因。

函数 FeaturePlot 可以展示不同基因在所有细胞的表达量 函数 VlnPlot 可以展示不同基因在不同分群的表达量差异情况 函数 DoHeatmap 可以选定基因集后绘制热图

使用M3Drop包

首先构建M3Drop需要的对象

library(M3Drop) 
Normalized_data <- M3DropCleanData(counts, 
                                   labels = sample_ann$Biological_Condition , 
                                   is.counts=TRUE, min_detected_genes=2000)
dim(Normalized_data$data)
## [1] 13958   124
length(Normalized_data$labels)
## [1] 124
class(Normalized_data)
## [1] "list"
str(Normalized_data)
## List of 2
##  $ data  : num [1:13958, 1:124] 0 0 0.486 0 0.486 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:13958] "A1BG" "A2M" "A2ML1" "AAAS" ...
##   .. ..$ : chr [1:124] "SRR1275356" "SRR1274090" "SRR1275251" "SRR1275287" ...
##  $ labels: chr [1:124] "GW16" "NPC" "GW16" "GW21+3" ...

这个包设计比较简单,并没有构建S4对象,只是一个简单的list而已。

统计学算法 Michaelis-Menten

需要深入读该文章,了解其算法,这里略过,总之它对单细胞转录组的表达矩阵进行了一系列的统计检验。

fits <- M3DropDropoutModels(Normalized_data$data)

# Sum absolute residuals
data.frame(MM=fits$MMFit$SAr, Logistic=fits$LogiFit$SAr,
           DoubleExpo=fits$ExpoFit$SAr) 
##     MM Logistic DoubleExpo
## 1 1607     1625       4075
# Sum squared residuals
data.frame(MM=fits$MMFit$SSr, Logistic=fits$LogiFit$SSr,
           DoubleExpo=fits$ExpoFit$SSr)
##    MM Logistic DoubleExpo
## 1 378      332       1989

找差异基因

DE_genes <- M3DropDifferentialExpression(Normalized_data$data, 
                                         mt_method="fdr", mt_threshold=0.01)

dim(DE_genes)
## [1] 263   3
head(DE_genes)
##          Gene      p.value      q.value
## ABCA1   ABCA1 1.381813e-04 0.0080700163
## ABCE1   ABCE1 1.956343e-05 0.0017964893
## ABLIM1 ABLIM1 7.353186e-06 0.0009082812
## ACAT2   ACAT2 1.886555e-05 0.0017438766
## ACVR1   ACVR1 1.259054e-05 0.0012970374
## AMER2   AMER2 5.698985e-06 0.0007231494

这里是针对上面的统计结果来的

针对差异基因画热图

par(mar=c(1,1,1,1)) 
heat_out <- M3DropExpressionHeatmap(DE_genes$Gene, Normalized_data$data, 
                                    cell_labels = Normalized_data$labels)

可视化了解一下找到的差异基因在不同的细胞类型的表达分布情况。

聚类

这里可以重新聚类后,针对自己找到的类别来分别找marker基因,不需要使用测试数据自带的表型信息。

cell_populations <- M3DropGetHeatmapCellClusters(heat_out, k=4)
library("ROCR") 
marker_genes <- M3DropGetMarkers(Normalized_data$data, cell_populations)
table(cell_populations,Normalized_data$labels)
##                 
## cell_populations GW16 GW21 GW21+3 NPC
##                1    0    0      0  30
##                2   23    6      8   0
##                3   21   10      0   0
##                4    2    0     24   0

每个类别的marker genes

head(marker_genes[marker_genes$Group==4,],20) 
##                AUC Group         pval
## MEF2C    0.9701727     4 1.155536e-15
## KIAA1598 0.9638932     4 3.177906e-14
## CALM1    0.9595761     4 6.736008e-13
## DPYSL3   0.9160126     4 7.070075e-11
## STMN2    0.9156201     4 5.197202e-11
## ADCY1    0.9150314     4 2.830176e-12
## DLG2     0.9085557     4 8.154496e-13
## MLLT11   0.9054160     4 2.279087e-10
## MEG3     0.9034537     4 6.217989e-11
## ZNF286B  0.9032575     4 2.403252e-10
## ETNK1    0.8975667     4 4.993506e-10
## CLASP2   0.8869702     4 9.814153e-10
## RTN1     0.8861852     4 1.097743e-09
## PHACTR3  0.8857928     4 2.491380e-12
## BHLHE22  0.8826531     4 5.787225e-13
## KIFAP3   0.8802983     4 3.942177e-10
## MIR100HG 0.8795133     4 2.732201e-09
## GRIN2B   0.8779435     4 1.667996e-15
## GRIA2    0.8763736     4 4.161204e-10
## KIF3C    0.8748038     4 4.317676e-10
marker_genes[rownames(marker_genes)=="FOS",] 
##          AUC Group        pval
## FOS 0.832712     2 2.99323e-09

也可以针对这些 marker genes去画热图,当然,得根据AUC和P值来挑选符合要求的差异基因去绘图。

par(mar=c(1,1,1,1)) 
choosed_marker_genes=as.character(unlist(lapply(
  split(marker_genes,marker_genes$Group), 
  function(x) (rownames(head(x,20)))
                                                )))
heat_out <- M3DropExpressionHeatmap(choosed_marker_genes, 
                                    Normalized_data$data,
                                    cell_labels =  cell_populations)

如果遇到Error in plot.new() : figure margins too large报错,则单独将heat_out这行命令复制出来运行

对感兴趣基因集进行注释

通常是GO/KEGG等数据库,通常是超几何分布,GSEA,GSVA等算法。

拿到基因集后走我GitHub代码即可:https://github.com/jmzeng1314/GEO 简单的例子如下:

library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
# 下面的 gene_up 是一个 entrez ID的向量,约 500左右的 自定义的基因集
## 下面的 gene_all 也是一个 entrez ID的向量,约10000左右的背景基因,就我们的scRNA检测到的全部基因。
  ###   over-representation test
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'hsa',
                      universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff =0.9)
  head(kk.up)[,1:6]
  dotplot(kk.up );ggsave('kk.up.dotplot.png')

其它单细胞R包

包括:

不一一讲解,具体有需求,就仔细研读说明书,其实最后都是R语言熟练与否。

显示运行环境

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] splines   parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] ROCR_1.0-7                  gplots_3.0.1               
##  [3] bindrcpp_0.2.2              dplyr_0.7.6                
##  [5] ggpubr_0.1.8                magrittr_1.5               
##  [7] M3Drop_1.8.1                numDeriv_2016.8-1          
##  [9] SC3_1.10.1                  scRNAseq_1.8.0             
## [11] monocle_2.10.1              DDRTree_0.1.5              
## [13] irlba_2.3.2                 VGAM_1.0-6                 
## [15] scater_1.10.0               SingleCellExperiment_1.4.0 
## [17] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [19] BiocParallel_1.14.2         matrixStats_0.54.0         
## [21] Biobase_2.40.0              GenomicRanges_1.34.0       
## [23] GenomeInfoDb_1.18.1         IRanges_2.16.0             
## [25] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [27] Seurat_2.3.4                Matrix_1.2-15              
## [29] cowplot_0.9.3               ggplot2_3.0.0              
## 
## loaded via a namespace (and not attached):
##   [1] ggthemes_4.0.1           prabclus_2.2-6          
##   [3] R.methodsS3_1.7.1        pkgmaker_0.27           
##   [5] tidyr_0.8.1              acepack_1.4.1           
##   [7] bit64_0.9-7              knitr_1.20              
##   [9] R.utils_2.6.0            data.table_1.11.4       
##  [11] rpart_4.1-13             RCurl_1.95-4.11         
##  [13] doParallel_1.0.14        metap_1.0               
##  [15] snow_0.4-2               RANN_2.6.1              
##  [17] combinat_0.0-8           proxy_0.4-22            
##  [19] bit_1.1-14               httpuv_1.4.5            
##  [21] assertthat_0.2.0         viridis_0.5.1           
##  [23] hms_0.4.2                evaluate_0.11           
##  [25] promises_1.0.1           fansi_0.3.0             
##  [27] DEoptimR_1.0-8           caTools_1.17.1.1        
##  [29] readxl_1.1.0             igraph_1.2.2            
##  [31] htmlwidgets_1.2          sparsesvd_0.1-4         
##  [33] purrr_0.2.5              crosstalk_1.0.0         
##  [35] backports_1.1.2          trimcluster_0.1-2.1     
##  [37] gbRd_0.4-11              TTR_0.23-4              
##  [39] abind_1.4-5              RcppEigen_0.3.3.4.0     
##  [41] withr_2.1.2              robustbase_0.93-3       
##  [43] checkmate_1.8.5          vcd_1.4-4               
##  [45] xts_0.11-2               mclust_5.4.1            
##  [47] cluster_2.0.7-1          ape_5.2                 
##  [49] segmented_0.5-3.0        lazyeval_0.2.1          
##  [51] laeken_0.5.0             crayon_1.3.4            
##  [53] hdf5r_1.0.1              pkgconfig_2.0.2         
##  [55] slam_0.1-44              labeling_0.3            
##  [57] nlme_3.1-137             vipor_0.4.5             
##  [59] nnet_7.3-12              bindr_0.1.1             
##  [61] rlang_0.2.2              diptest_0.75-7          
##  [63] registry_0.5             doSNOW_1.0.16           
##  [65] cellranger_1.1.0         rprojroot_1.3-2         
##  [67] lmtest_0.9-36            rngtools_1.3.1          
##  [69] carData_3.0-1            Rhdf5lib_1.4.2          
##  [71] boot_1.3-20              zoo_1.8-3               
##  [73] base64enc_0.1-3          beeswarm_0.2.3          
##  [75] ggridges_0.5.0           pheatmap_1.0.10         
##  [77] png_0.1-7                viridisLite_0.3.0       
##  [79] bitops_1.0-6             R.oo_1.22.0             
##  [81] KernSmooth_2.23-15       DelayedMatrixStats_1.4.0
##  [83] doRNG_1.7.1              lars_1.2                
##  [85] stringr_1.3.1            scales_1.0.0            
##  [87] plyr_1.8.4               ica_1.0-2               
##  [89] bibtex_0.4.2             gdata_2.18.0            
##  [91] zlibbioc_1.26.0          compiler_3.5.2          
##  [93] HSMMSingleCell_1.2.0     lsei_1.2-0              
##  [95] bbmle_1.0.20             RColorBrewer_1.1-2      
##  [97] rrcov_1.4-4              fitdistrplus_1.0-11     
##  [99] cli_1.0.0                dtw_1.20-1              
## [101] XVector_0.22.0           pbapply_1.3-4           
## [103] htmlTable_1.12           Formula_1.2-3           
## [105] MASS_7.3-51.1            mgcv_1.8-26             
## [107] tidyselect_0.2.4         stringi_1.2.4           
## [109] forcats_0.3.0            densityClust_0.3        
## [111] yaml_2.2.0               latticeExtra_0.6-28     
## [113] ggrepel_0.8.0            grid_3.5.2              
## [115] tools_3.5.2              rio_0.5.10              
## [117] rstudioapi_0.7           foreach_1.4.4           
## [119] foreign_0.8-71           gridExtra_2.3           
## [121] smoother_1.1             scatterplot3d_0.3-41    
## [123] Rtsne_0.15               digest_0.6.18           
## [125] BiocManager_1.30.3       FNN_1.1.2.2             
## [127] shiny_1.1.0              qlcMatrix_0.9.7         
## [129] fpc_2.1-11.1             Rcpp_1.0.0              
## [131] car_3.0-2                SDMTools_1.1-221        
## [133] later_0.7.4              WriteXLS_4.1.0          
## [135] httr_1.3.1               npsurv_0.4-0            
## [137] kernlab_0.9-27           Rdpack_0.10-1           
## [139] colorspace_1.3-2         ranger_0.11.1           
## [141] reticulate_1.10          statmod_1.4.30          
## [143] sp_1.3-1                 flexmix_2.3-14          
## [145] xtable_1.8-3             jsonlite_1.5            
## [147] modeltools_0.2-22        destiny_2.12.0          
## [149] R6_2.2.2                 Hmisc_4.1-1             
## [151] pillar_1.3.0             htmltools_0.3.6         
## [153] mime_0.5                 glue_1.3.0              
## [155] VIM_4.8.0                DT_0.4                  
## [157] class_7.3-14             codetools_0.2-15        
## [159] utf8_1.1.4               tsne_0.1-3              
## [161] pcaPP_1.9-73             mvtnorm_1.0-8           
## [163] lattice_0.20-38          tibble_1.4.2            
## [165] mixtools_1.1.0           curl_3.2                
## [167] ggbeeswarm_0.6.0         gtools_3.8.1            
## [169] zip_1.0.0                openxlsx_4.1.0          
## [171] survival_2.43-3          limma_3.36.3            
## [173] rmarkdown_1.10           docopt_0.6.1            
## [175] fastICA_1.2-1            munsell_0.5.0           
## [177] e1071_1.7-0              rhdf5_2.26.1            
## [179] GenomeInfoDbData_1.1.0   iterators_1.0.10        
## [181] HDF5Array_1.10.1         haven_1.1.2             
## [183] reshape2_1.4.3           gtable_0.2.0